I am interested in the idea of commuting for work as it relates to different socio-economic indicators. Commuting here is defined as trip to work using public transportation, biking, and walking. Here I explore this by looking at the commuting information from 2013-2017 American Community Survey (ACS) and socio-demog information, I aggregated this information at the Census Tract level.

First we read-in our data downloaded from the Florida Geographic Data Library.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(sf)
## Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
library(sp)
library(spdep)
## Loading required package: Matrix
## Loading required package: spData
library(spatstat)
## Loading required package: spatstat.data
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: rpart
## 
## spatstat 1.58-2       (nickname: 'Not Even Wrong') 
## For an introduction to spatstat, type 'beginner'
## 
## Note: spatstat version 1.58-2 is out of date by more than 3 months; we recommend upgrading to the latest version.
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following objects are masked from 'package:spatstat':
## 
##     bc, ellipse
## The following object is masked from 'package:dplyr':
## 
##     recode
library(ggplot2)
library(tmap)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
CEN.sf <- read_sf(dsn = "centract_2017d") %>%
  mutate(COMMUTE = SUM_TRAN_P+SUM_TRAN_W+SUM_TRAN_B, PCT_POV = (SUM_BELOW_/SUM_ABVE_B)*100, PCT_WHITE = (SUM_WHITE_/SUM_TOTALP)*100, PCT_BLACK = (SUM_BLACK/SUM_TOTALP)*100, PCT_HISP = (SUM_HISPAN/SUM_TOTALP)*100, PCT_COMMUTE = (COMMUTE/SUM_TRAN_T)*100, POPDEN = SUM_TOTALP/Shape_Area)

Visualize data using Tmap.

tm_shape(CEN.sf) +
  tm_fill("COMMUTE") +
  tm_borders(col = "grey40", lwd = .1, lty = "solid", alpha = 0.1) +
  tm_layout(title = "Commute to Work")

sm::sm.density(CEN.sf$COMMUTE, 
               model = "Normal",
               xlab = "")

x <- CEN.sf$COMMUTE != 0
e <- min(CEN.sf$COMMUTE[x])
CEN.sf$COMMUTE[!x] <- e
CEN.sf$logCOMMUTE <- log(CEN.sf$COMMUTE)

tm_shape(CEN.sf) +
  tm_fill("logCOMMUTE") +
  tm_borders(col = "grey40", lwd = .1, lty = "solid", alpha = 0.1) +
  tm_layout(title = "Log Commute to Work")

sm::sm.density(CEN.sf$logCOMMUTE, 
               model = "Normal",
               xlab = "")

Develop a Linear Model.

model <- lm(logCOMMUTE ~ PCT_POV + SUM_ED_LES + SUM_ED_12N + SUM_HSGRAD + SUM_ED_COL + SUM_BACHEL + MEAN_MEDHH + SUM_OWNER + SUM_RENTER + SUM_M15 + MEAN_MEDOO + SUM_MINORI + SUM_LABOR_ + PCT_BLACK + PCT_WHITE + PCT_HISP + SUM_VEHICL + SUM_VEHI_1 + POPDEN, data = CEN.sf)

summary(model)
## 
## Call:
## lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + SUM_ED_12N + 
##     SUM_HSGRAD + SUM_ED_COL + SUM_BACHEL + MEAN_MEDHH + SUM_OWNER + 
##     SUM_RENTER + SUM_M15 + MEAN_MEDOO + SUM_MINORI + SUM_LABOR_ + 
##     PCT_BLACK + PCT_WHITE + PCT_HISP + SUM_VEHICL + SUM_VEHI_1 + 
##     POPDEN, data = CEN.sf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6152 -0.3296  0.0788  0.4354  2.7737 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.723e+00  4.821e-01  13.945  < 2e-16 ***
## PCT_POV      2.344e-02  2.669e-03   8.782  < 2e-16 ***
## SUM_ED_LES   2.797e-04  3.453e-05   8.102 7.04e-16 ***
## SUM_ED_12N  -1.004e-04  3.963e-05  -2.533  0.01135 *  
## SUM_HSGRAD   2.835e-05  1.699e-05   1.668  0.09529 .  
## SUM_ED_COL   7.955e-05  3.089e-05   2.575  0.01005 *  
## SUM_BACHEL  -1.364e-04  5.130e-05  -2.659  0.00786 ** 
## MEAN_MEDHH  -3.244e-06  1.438e-06  -2.256  0.02411 *  
## SUM_OWNER   -4.058e-04  3.722e-05 -10.904  < 2e-16 ***
## SUM_RENTER  -4.217e-04  4.501e-05  -9.369  < 2e-16 ***
## SUM_M15      1.774e-04  6.033e-05   2.940  0.00330 ** 
## MEAN_MEDOO   2.135e-06  1.629e-07  13.110  < 2e-16 ***
## SUM_MINORI  -6.854e-05  5.383e-06 -12.733  < 2e-16 ***
## SUM_LABOR_   2.551e-04  1.418e-05  17.989  < 2e-16 ***
## PCT_BLACK   -1.872e-02  4.808e-03  -3.894  0.00010 ***
## PCT_WHITE   -3.561e-02  4.906e-03  -7.257 4.67e-13 ***
## PCT_HISP    -2.509e-02  4.726e-03  -5.308 1.16e-07 ***
## SUM_VEHICL   1.143e-03  6.272e-05  18.215  < 2e-16 ***
## SUM_VEHI_1   3.954e-04  2.813e-05  14.054  < 2e-16 ***
## POPDEN       7.040e+00  2.292e+00   3.071  0.00215 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7241 on 4224 degrees of freedom
## Multiple R-squared:  0.5569, Adjusted R-squared:  0.5549 
## F-statistic: 279.4 on 19 and 4224 DF,  p-value: < 2.2e-16
#simplify the Model

step(model, 
     direction = "both")
## Start:  AIC=-2720.44
## logCOMMUTE ~ PCT_POV + SUM_ED_LES + SUM_ED_12N + SUM_HSGRAD + 
##     SUM_ED_COL + SUM_BACHEL + MEAN_MEDHH + SUM_OWNER + SUM_RENTER + 
##     SUM_M15 + MEAN_MEDOO + SUM_MINORI + SUM_LABOR_ + PCT_BLACK + 
##     PCT_WHITE + PCT_HISP + SUM_VEHICL + SUM_VEHI_1 + POPDEN
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    2214.6 -2720.4
## - SUM_HSGRAD  1     1.460 2216.1 -2719.6
## - MEAN_MEDHH  1     2.669 2217.3 -2717.3
## - SUM_ED_12N  1     3.363 2218.0 -2716.0
## - SUM_ED_COL  1     3.477 2218.1 -2715.8
## - SUM_BACHEL  1     3.708 2218.3 -2715.3
## - SUM_M15     1     4.531 2219.1 -2713.8
## - POPDEN      1     4.944 2219.6 -2713.0
## - PCT_BLACK   1     7.951 2222.6 -2707.2
## - PCT_HISP    1    14.773 2229.4 -2694.2
## - PCT_WHITE   1    27.614 2242.2 -2669.8
## - SUM_ED_LES  1    34.412 2249.0 -2657.0
## - PCT_POV     1    40.436 2255.0 -2645.7
## - SUM_RENTER  1    46.024 2260.6 -2635.1
## - SUM_OWNER   1    62.337 2276.9 -2604.6
## - SUM_MINORI  1    84.998 2299.6 -2562.6
## - MEAN_MEDOO  1    90.108 2304.7 -2553.2
## - SUM_VEHI_1  1   103.561 2318.2 -2528.5
## - SUM_LABOR_  1   169.669 2384.3 -2409.1
## - SUM_VEHICL  1   173.954 2388.6 -2401.5
## 
## Call:
## lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + SUM_ED_12N + 
##     SUM_HSGRAD + SUM_ED_COL + SUM_BACHEL + MEAN_MEDHH + SUM_OWNER + 
##     SUM_RENTER + SUM_M15 + MEAN_MEDOO + SUM_MINORI + SUM_LABOR_ + 
##     PCT_BLACK + PCT_WHITE + PCT_HISP + SUM_VEHICL + SUM_VEHI_1 + 
##     POPDEN, data = CEN.sf)
## 
## Coefficients:
## (Intercept)      PCT_POV   SUM_ED_LES   SUM_ED_12N   SUM_HSGRAD  
##   6.723e+00    2.344e-02    2.797e-04   -1.004e-04    2.835e-05  
##  SUM_ED_COL   SUM_BACHEL   MEAN_MEDHH    SUM_OWNER   SUM_RENTER  
##   7.955e-05   -1.364e-04   -3.244e-06   -4.058e-04   -4.217e-04  
##     SUM_M15   MEAN_MEDOO   SUM_MINORI   SUM_LABOR_    PCT_BLACK  
##   1.774e-04    2.135e-06   -6.854e-05    2.551e-04   -1.872e-02  
##   PCT_WHITE     PCT_HISP   SUM_VEHICL   SUM_VEHI_1       POPDEN  
##  -3.561e-02   -2.509e-02    1.143e-03    3.954e-04    7.040e+00
model2 <- lm(logCOMMUTE ~ PCT_POV + SUM_ED_LES +SUM_ED_12N + SUM_HSGRAD + SUM_ED_COL + SUM_BACHEL + MEAN_MEDHH + SUM_OWNER + SUM_RENTER + SUM_M15 + MEAN_MEDOO + SUM_MINORI + SUM_LABOR_ + PCT_BLACK + PCT_WHITE + PCT_HISP + SUM_VEHICL + SUM_VEHI_1 + POPDEN, data = CEN.sf)

summary(model2)
## 
## Call:
## lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + SUM_ED_12N + 
##     SUM_HSGRAD + SUM_ED_COL + SUM_BACHEL + MEAN_MEDHH + SUM_OWNER + 
##     SUM_RENTER + SUM_M15 + MEAN_MEDOO + SUM_MINORI + SUM_LABOR_ + 
##     PCT_BLACK + PCT_WHITE + PCT_HISP + SUM_VEHICL + SUM_VEHI_1 + 
##     POPDEN, data = CEN.sf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6152 -0.3296  0.0788  0.4354  2.7737 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.723e+00  4.821e-01  13.945  < 2e-16 ***
## PCT_POV      2.344e-02  2.669e-03   8.782  < 2e-16 ***
## SUM_ED_LES   2.797e-04  3.453e-05   8.102 7.04e-16 ***
## SUM_ED_12N  -1.004e-04  3.963e-05  -2.533  0.01135 *  
## SUM_HSGRAD   2.835e-05  1.699e-05   1.668  0.09529 .  
## SUM_ED_COL   7.955e-05  3.089e-05   2.575  0.01005 *  
## SUM_BACHEL  -1.364e-04  5.130e-05  -2.659  0.00786 ** 
## MEAN_MEDHH  -3.244e-06  1.438e-06  -2.256  0.02411 *  
## SUM_OWNER   -4.058e-04  3.722e-05 -10.904  < 2e-16 ***
## SUM_RENTER  -4.217e-04  4.501e-05  -9.369  < 2e-16 ***
## SUM_M15      1.774e-04  6.033e-05   2.940  0.00330 ** 
## MEAN_MEDOO   2.135e-06  1.629e-07  13.110  < 2e-16 ***
## SUM_MINORI  -6.854e-05  5.383e-06 -12.733  < 2e-16 ***
## SUM_LABOR_   2.551e-04  1.418e-05  17.989  < 2e-16 ***
## PCT_BLACK   -1.872e-02  4.808e-03  -3.894  0.00010 ***
## PCT_WHITE   -3.561e-02  4.906e-03  -7.257 4.67e-13 ***
## PCT_HISP    -2.509e-02  4.726e-03  -5.308 1.16e-07 ***
## SUM_VEHICL   1.143e-03  6.272e-05  18.215  < 2e-16 ***
## SUM_VEHI_1   3.954e-04  2.813e-05  14.054  < 2e-16 ***
## POPDEN       7.040e+00  2.292e+00   3.071  0.00215 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7241 on 4224 degrees of freedom
## Multiple R-squared:  0.5569, Adjusted R-squared:  0.5549 
## F-statistic: 279.4 on 19 and 4224 DF,  p-value: < 2.2e-16
drop1(model2)
## Single term deletions
## 
## Model:
## logCOMMUTE ~ PCT_POV + SUM_ED_LES + SUM_ED_12N + SUM_HSGRAD + 
##     SUM_ED_COL + SUM_BACHEL + MEAN_MEDHH + SUM_OWNER + SUM_RENTER + 
##     SUM_M15 + MEAN_MEDOO + SUM_MINORI + SUM_LABOR_ + PCT_BLACK + 
##     PCT_WHITE + PCT_HISP + SUM_VEHICL + SUM_VEHI_1 + POPDEN
##            Df Sum of Sq    RSS     AIC
## <none>                  2214.6 -2720.4
## PCT_POV     1    40.436 2255.0 -2645.7
## SUM_ED_LES  1    34.412 2249.0 -2657.0
## SUM_ED_12N  1     3.363 2218.0 -2716.0
## SUM_HSGRAD  1     1.460 2216.1 -2719.6
## SUM_ED_COL  1     3.477 2218.1 -2715.8
## SUM_BACHEL  1     3.708 2218.3 -2715.3
## MEAN_MEDHH  1     2.669 2217.3 -2717.3
## SUM_OWNER   1    62.337 2276.9 -2604.6
## SUM_RENTER  1    46.024 2260.6 -2635.1
## SUM_M15     1     4.531 2219.1 -2713.8
## MEAN_MEDOO  1    90.108 2304.7 -2553.2
## SUM_MINORI  1    84.998 2299.6 -2562.6
## SUM_LABOR_  1   169.669 2384.3 -2409.1
## PCT_BLACK   1     7.951 2222.6 -2707.2
## PCT_WHITE   1    27.614 2242.2 -2669.8
## PCT_HISP    1    14.773 2229.4 -2694.2
## SUM_VEHICL  1   173.954 2388.6 -2401.5
## SUM_VEHI_1  1   103.561 2318.2 -2528.5
## POPDEN      1     4.944 2219.6 -2713.0
step(model2, 
     direction = "both")
## Start:  AIC=-2720.44
## logCOMMUTE ~ PCT_POV + SUM_ED_LES + SUM_ED_12N + SUM_HSGRAD + 
##     SUM_ED_COL + SUM_BACHEL + MEAN_MEDHH + SUM_OWNER + SUM_RENTER + 
##     SUM_M15 + MEAN_MEDOO + SUM_MINORI + SUM_LABOR_ + PCT_BLACK + 
##     PCT_WHITE + PCT_HISP + SUM_VEHICL + SUM_VEHI_1 + POPDEN
## 
##              Df Sum of Sq    RSS     AIC
## <none>                    2214.6 -2720.4
## - SUM_HSGRAD  1     1.460 2216.1 -2719.6
## - MEAN_MEDHH  1     2.669 2217.3 -2717.3
## - SUM_ED_12N  1     3.363 2218.0 -2716.0
## - SUM_ED_COL  1     3.477 2218.1 -2715.8
## - SUM_BACHEL  1     3.708 2218.3 -2715.3
## - SUM_M15     1     4.531 2219.1 -2713.8
## - POPDEN      1     4.944 2219.6 -2713.0
## - PCT_BLACK   1     7.951 2222.6 -2707.2
## - PCT_HISP    1    14.773 2229.4 -2694.2
## - PCT_WHITE   1    27.614 2242.2 -2669.8
## - SUM_ED_LES  1    34.412 2249.0 -2657.0
## - PCT_POV     1    40.436 2255.0 -2645.7
## - SUM_RENTER  1    46.024 2260.6 -2635.1
## - SUM_OWNER   1    62.337 2276.9 -2604.6
## - SUM_MINORI  1    84.998 2299.6 -2562.6
## - MEAN_MEDOO  1    90.108 2304.7 -2553.2
## - SUM_VEHI_1  1   103.561 2318.2 -2528.5
## - SUM_LABOR_  1   169.669 2384.3 -2409.1
## - SUM_VEHICL  1   173.954 2388.6 -2401.5
## 
## Call:
## lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + SUM_ED_12N + 
##     SUM_HSGRAD + SUM_ED_COL + SUM_BACHEL + MEAN_MEDHH + SUM_OWNER + 
##     SUM_RENTER + SUM_M15 + MEAN_MEDOO + SUM_MINORI + SUM_LABOR_ + 
##     PCT_BLACK + PCT_WHITE + PCT_HISP + SUM_VEHICL + SUM_VEHI_1 + 
##     POPDEN, data = CEN.sf)
## 
## Coefficients:
## (Intercept)      PCT_POV   SUM_ED_LES   SUM_ED_12N   SUM_HSGRAD  
##   6.723e+00    2.344e-02    2.797e-04   -1.004e-04    2.835e-05  
##  SUM_ED_COL   SUM_BACHEL   MEAN_MEDHH    SUM_OWNER   SUM_RENTER  
##   7.955e-05   -1.364e-04   -3.244e-06   -4.058e-04   -4.217e-04  
##     SUM_M15   MEAN_MEDOO   SUM_MINORI   SUM_LABOR_    PCT_BLACK  
##   1.774e-04    2.135e-06   -6.854e-05    2.551e-04   -1.872e-02  
##   PCT_WHITE     PCT_HISP   SUM_VEHICL   SUM_VEHI_1       POPDEN  
##  -3.561e-02   -2.509e-02    1.143e-03    3.954e-04    7.040e+00
#check for multicollinearity

car::vif(model2)
##    PCT_POV SUM_ED_LES SUM_ED_12N SUM_HSGRAD SUM_ED_COL SUM_BACHEL 
##   4.119235   4.162464   5.653405 108.557802  74.240281  79.948777 
## MEAN_MEDHH  SUM_OWNER SUM_RENTER    SUM_M15 MEAN_MEDOO SUM_MINORI 
##   6.146399 100.967395  42.096742   9.962975   3.634743  14.613441 
## SUM_LABOR_  PCT_BLACK  PCT_WHITE   PCT_HISP SUM_VEHICL SUM_VEHI_1 
##  44.309295  59.248033 133.230025  86.359537   3.566154  16.931507 
##     POPDEN 
##   1.755613
model2 <- lm(logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO + PCT_HISP + SUM_VEHICL + POPDEN, data = CEN.sf)

car::vif(model2)
##    PCT_POV SUM_ED_LES MEAN_MEDOO   PCT_HISP SUM_VEHICL     POPDEN 
##   1.867784   2.094287   1.455286   2.322536   1.717280   1.590927
step(model2, 
     direction = "both")
## Start:  AIC=-1579.11
## logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO + PCT_HISP + SUM_VEHICL + 
##     POPDEN
## 
##              Df Sum of Sq    RSS      AIC
## <none>                    2915.7 -1579.11
## - PCT_HISP    1      7.19 2922.9 -1570.66
## - SUM_ED_LES  1     41.05 2956.8 -1521.78
## - POPDEN      1     55.86 2971.6 -1500.58
## - PCT_POV     1     74.23 2990.0 -1474.42
## - MEAN_MEDOO  1    136.69 3052.4 -1386.67
## - SUM_VEHICL  1    500.83 3416.6  -908.38
## 
## Call:
## lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO + 
##     PCT_HISP + SUM_VEHICL + POPDEN, data = CEN.sf)
## 
## Coefficients:
## (Intercept)      PCT_POV   SUM_ED_LES   MEAN_MEDOO     PCT_HISP  
##   3.729e+00    2.138e-02    2.167e-04    1.664e-06    2.870e-03  
##  SUM_VEHICL       POPDEN  
##   1.345e-03    2.253e+01
summary(model2)
## 
## Call:
## lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO + 
##     PCT_HISP + SUM_VEHICL + POPDEN, data = CEN.sf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4686 -0.4033  0.1032  0.5329  2.6347 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.729e+00  4.528e-02  82.363  < 2e-16 ***
## PCT_POV     2.138e-02  2.059e-03  10.386  < 2e-16 ***
## SUM_ED_LES  2.167e-04  2.806e-05   7.723  1.4e-14 ***
## MEAN_MEDOO  1.664e-06  1.181e-07  14.094  < 2e-16 ***
## PCT_HISP    2.870e-03  8.880e-04   3.232  0.00124 ** 
## SUM_VEHICL  1.345e-03  4.987e-05  26.977  < 2e-16 ***
## POPDEN      2.253e+01  2.500e+00   9.009  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8296 on 4237 degrees of freedom
## Multiple R-squared:  0.4166, Adjusted R-squared:  0.4158 
## F-statistic: 504.3 on 6 and 4237 DF,  p-value: < 2.2e-16

Thinking about spatial autocorrelation and fitting a spatial regression model.

Define neighboorhood

CEN.sp <- as(CEN.sf, "Spatial")

nbs <- poly2nb(CEN.sp)

plot(nbs, 
     coordinates(CEN.sp))

wts <- nb2listw(nbs)

m <- length(nbs)
s <- Szero(wts)

Test for Autocorrelation

#on the response variable

moran(CEN.sp$logCOMMUTE, 
      listw = wts, 
      n = m, 
      S0 = s)
## $I
## [1] 0.5952112
## 
## $K
## [1] 6.030541
#0.59 Moran's I value is high denoting fairly high spatial autocorrelation
#Kurtosis of is fairly high (6) so the inference maybe suspect

geary(CEN.sp$logCOMMUTE, 
      listw = wts,
      n = m, 
      S0 = s, 
      n1 = m - 1)
## $C
## [1] 0.373284
## 
## $K
## [1] 6.030541
#Geary C value less than one (0.37) indicates spatial autocorrelation

moran.test(CEN.sp$logCOMMUTE, 
           wts)
## 
##  Moran I test under randomisation
## 
## data:  CEN.sp$logCOMMUTE  
## weights: wts    
## 
## Moran I statistic standard deviate = 67.937, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##      5.952112e-01     -2.356823e-04      7.682078e-05
##Highly significant spatial autocorrelation although normality is suspect based on the previous density plot of logCOMMUTE. Lets test using Monte Carlo approach.

set.seed(4102)
mP <- moran.mc(CEN.sp$logCOMMUTE, 
               listw = wts,
               nsim = 99)
mP
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  CEN.sp$logCOMMUTE 
## weights: wts  
## number of simulations + 1: 100 
## 
## statistic = 0.59521, observed rank = 100, p-value = 0.01
## alternative hypothesis: greater
#The results confirm that there is significant spatial autocorrelation


#CHECK Residuals from the Model

res <- residuals(model2)

sm::sm.density(res, 
               model = "Normal")

CEN.sf$res <- res

tm_shape(CEN.sf) +
  tm_fill("res") +
  tm_borders(col = "gray70") +
  tm_layout(title = "Linear Model Residuals")
## Variable "res" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

There are clustered regions where the model over predicts logCommute based on explanatory variables (red and orange regions) and where it under predicts logCommute (green).

Fit a Spatial Regression Model

#Test spatial regression fit

lm.LMtests(model2, 
           listw = wts, 
           test = c("LMerr", "LMlag"))
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO
## + PCT_HISP + SUM_VEHICL + POPDEN, data = CEN.sf)
## weights: wts
## 
## LMerr = 2941.3, df = 1, p-value < 2.2e-16
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO
## + PCT_HISP + SUM_VEHICL + POPDEN, data = CEN.sf)
## weights: wts
## 
## LMlag = 2429, df = 1, p-value < 2.2e-16
#More robust test

lm.LMtests(model2,
           listw = wts,
           test = c("RLMerr", "RLMlag"))
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO
## + PCT_HISP + SUM_VEHICL + POPDEN, data = CEN.sf)
## weights: wts
## 
## RLMerr = 535.8, df = 1, p-value < 2.2e-16
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO
## + PCT_HISP + SUM_VEHICL + POPDEN, data = CEN.sf)
## weights: wts
## 
## RLMlag = 23.472, df = 1, p-value = 1.267e-06
#Spatial Lag

model.lag <- lagsarlm(logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO + PCT_HISP + SUM_VEHICL + POPDEN, 
                      data = CEN.sf, 
                      listw = wts, tol.solve=1.0e-20)

summary(model.lag)
## 
## Call:lagsarlm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO + 
##     PCT_HISP + SUM_VEHICL + POPDEN, data = CEN.sf, listw = wts, 
##     tol.solve = 1e-20)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -5.102405 -0.268024  0.077845  0.378581  2.250328 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##                Estimate  Std. Error z value  Pr(>|z|)
## (Intercept)  8.4217e-01  5.5110e-02 15.2815 < 2.2e-16
## PCT_POV      1.1582e-03  1.5778e-03  0.7341   0.46291
## SUM_ED_LES   2.1994e-04  2.1436e-05 10.2603 < 2.2e-16
## MEAN_MEDOO   6.1309e-07  9.1012e-08  6.7364 1.624e-11
## PCT_HISP    -1.4432e-03  6.7172e-04 -2.1485   0.03168
## SUM_VEHICL   8.5580e-04  4.0187e-05 21.2954 < 2.2e-16
## POPDEN       3.7159e+00  1.9075e+00  1.9480   0.05142
## 
## Rho: 0.70578, LR test value: 1936.7, p-value: < 2.22e-16
## Asymptotic standard error: 0.011886
##     z-value: 59.379, p-value: < 2.22e-16
## Wald statistic: 3525.9, p-value: < 2.22e-16
## 
## Log likelihood: -4257.078 for lag model
## ML residual variance (sigma squared): 0.39228, (sigma: 0.62633)
## Number of observations: 4244 
## Number of parameters estimated: 9 
## AIC: 8532.2, (AIC for lm: 10467)
## LM test for residual autocorrelation
## test value: 103.74, p-value: < 2.22e-16
spdep::impacts(model.lag, 
               listw = wts)
## Impact measures (lag, exact):
##                   Direct      Indirect         Total
## PCT_POV     1.311608e-03  2.624848e-03  3.936455e-03
## SUM_ED_LES  2.490761e-04  4.984621e-04  7.475382e-04
## MEAN_MEDOO  6.943041e-07  1.389472e-06  2.083776e-06
## PCT_HISP   -1.634339e-03 -3.270711e-03 -4.905050e-03
## SUM_VEHICL  9.691617e-04  1.939529e-03  2.908691e-03
## POPDEN      4.208085e+00  8.421407e+00  1.262949e+01
#How big is the improvement from the Linear model, .46 which is considered large.

x <- 2 * (logLik(model.lag) - logLik(model2))/4244
x[1]
## [1] 0.4563346

1 | huge, .1 | large, .01 | good, .001 | okay

Compare with an spatial error model

model.error <- errorsarlm(logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO + PCT_HISP + SUM_VEHICL + POPDEN, 
                          data = CEN.sf,
                          listw = wts, tol.solve=1.0e-20)

summary(model.error)
## 
## Call:
## errorsarlm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO + 
##     PCT_HISP + SUM_VEHICL + POPDEN, data = CEN.sf, listw = wts, 
##     tol.solve = 1e-20)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -4.821058 -0.230161  0.087165  0.348925  2.108316 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##               Estimate Std. Error z value  Pr(>|z|)
## (Intercept) 3.9711e+00 8.5045e-02 46.6943 < 2.2e-16
## PCT_POV     9.6354e-03 2.6952e-03  3.5750 0.0003502
## SUM_ED_LES  4.3118e-04 3.2407e-05 13.3051 < 2.2e-16
## MEAN_MEDOO  7.5151e-07 1.8133e-07  4.1445 3.406e-05
## PCT_HISP    3.3347e-03 1.6663e-03  2.0012 0.0453656
## SUM_VEHICL  1.1295e-03 5.2315e-05 21.5907 < 2.2e-16
## POPDEN      5.1526e+00 2.3558e+00  2.1872 0.0287288
## 
## Lambda: 0.81072, LR test value: 2267.4, p-value: < 2.22e-16
## Asymptotic standard error: 0.011185
##     z-value: 72.484, p-value: < 2.22e-16
## Wald statistic: 5253.9, p-value: < 2.22e-16
## 
## Log likelihood: -4091.724 for error model
## ML residual variance (sigma squared): 0.3463, (sigma: 0.58847)
## Number of observations: 4244 
## Number of parameters estimated: 9 
## AIC: 8201.4, (AIC for lm: 10467)
x <- 2 * (logLik(model.error) - logLik(model2))/4244
x[1]
## [1] 0.5342582
#The improvement from the linear model is .53 which is considered large.

Predictions

The predict() method implements the predict.errorsarlm() function to calculate predictions from the spatial regression model. The prediction is decomposed into a “trend” term (explanatory variable effect) and a “signal” term (spatial smoother). The fit is the sum of the trend and the signal terms.

We make predictions at the values of the explanatory variables using the spatial lag model as

pre <- predict(model.error)
str(pre)
##  'sarlm.pred' Named num [1:4244] 5.18 5.23 6.01 6.72 5.94 ...
##  - attr(*, "names")= chr [1:4244] "1" "2" "3" "4" ...
##  - attr(*, "trend")= num [1:4244] 4.82 5.13 5.85 6.37 5.62 ...
##  - attr(*, "signal")= num [1:4244] 0.3593 0.0989 0.1571 0.3493 0.3135 ...
##  - attr(*, "region.id")= chr [1:4244] "1" "2" "3" "4" ...
##  - attr(*, "pred.type")= chr "TS"
##  - attr(*, "call")= language predict.sarlm(object = model.error)
CEN.sp$logCOMMUTE[1:5]
## [1] 5.075174 5.303305 6.406880 5.902633 5.913503

The predictions are added to the spatial polygons data frame.

CEN.sf$fit <- as.numeric(pre)
CEN.sf$trend <- attr(pre, "trend")
CEN.sf$signal <- attr(pre, "signal")

The predictions are mapped.

( g3 <- ggplot() +
    geom_sf(data = CEN.sf, aes(fill = fit)) +
    scale_fill_gradient(low = "white", high = "green") +
    ggtitle("Predicted Commute(log)", 
          subtitle = "Number of Workers that Commutes(log)") )

( g4 <- ggplot() +
    geom_sf(data = CEN.sf, aes(fill = trend)) +
    scale_fill_gradient(low = "white", high = "orange") +
    ggtitle("Trend (Covariate)") )

( g5 <- ggplot() +
    geom_sf(data = CEN.sf, aes(fill = signal)) +
    scale_fill_gradient(low = "white", high = "blue") +
    ggtitle("Signal") )

grid.arrange(g3, g4, g5, nrow = 1)

THINKING ABOUT SCALE and SCOPE

I was wondering how the scale and region of analysis affects the relationship so after looking at the state level, I decided to explore the county level choosing Duval County which includes Jacksonville City, the most populous City in Florida and largest in terms of land area in the US. More importantly, it has a franchise of Jollibee - a Filipino restaurant that I missed more than anything.

First we read-in our data downloaded from the Florida Geographic Data Library.

Duval.sf <- read_sf(dsn = "DuvalCountyC") %>% 
  mutate(COMMUTE = SUM_TRAN_P+SUM_TRAN_W+SUM_TRAN_B, PCT_POV = (SUM_BELOW_/SUM_ABVE_B)*100, PCT_WHITE = (SUM_WHITE_/SUM_TOTALP)*100, PCT_BLACK = (SUM_BLACK/SUM_TOTALP)*100, PCT_HISP = (SUM_HISPAN/SUM_TOTALP)*100, PCT_COMMUTE = (COMMUTE/SUM_TRAN_T)*100, POPDEN = SUM_TOTALP/(Shape_Area/1000000))

Visualize them using Tmap.

tm_shape(Duval.sf) +
  tm_fill("COMMUTE") +
  tm_borders(col = "grey40", lwd = .4, lty = "solid", alpha = 1) +
  tm_layout(title = "Commute to Work")

sm::sm.density(Duval.sf$COMMUTE, 
               model = "Normal",
               xlab = "")

sm::sm.density(Duval.sf$PCT_COMMUTE, 
               model = "Normal",
               xlab = "")

x <- Duval.sf$COMMUTE != 0
e <- min(Duval.sf$COMMUTE[x])
Duval.sf$COMMUTE[!x] <- e
Duval.sf$logCOMMUTE <- log(Duval.sf$COMMUTE)

tm_shape(Duval.sf) +
  tm_fill("logCOMMUTE") +
  tm_borders(col = "grey40", lwd = .1, lty = "solid", alpha = 0.1) +
  tm_layout(title = "Log Commute to Work")

sm::sm.density(Duval.sf$logCOMMUTE, 
               model = "Normal",
               xlab = "")

Use our Linear Model

 modelD <- lm(logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO + PCT_HISP + SUM_VEHICL + POPDEN, data = Duval.sf)

Define neighboorhood

library(spdep)
library(spatstat)

Duval.sp <- as(Duval.sf, "Spatial")

nbs <- poly2nb(Duval.sp)

plot(nbs, 
     coordinates(Duval.sp))

wts <- nb2listw(nbs)

m <- length(nbs)
s <- Szero(wts)

Test for Autocorrelation

moran(Duval.sp$logCOMMUTE, 
      listw = wts, 
      n = m, 
      S0 = s)
## $I
## [1] 0.3228592
## 
## $K
## [1] 7.327067
#0.32 Moran's I value is denoting fairly high spatial autocorrelation
#Kurtosis is high (7), questioning normality, so the inference maybe suspect

geary(Duval.sp$logCOMMUTE, 
      listw = wts,
      n = m, 
      S0 = s, 
      n1 = m - 1)
## $C
## [1] 0.6358107
## 
## $K
## [1] 7.327067
#Geary C value less than one (0.63) indicates spatial autocorrelation

moran.test(Duval.sp$logCOMMUTE, 
           wts)
## 
##  Moran I test under randomisation
## 
## data:  Duval.sp$logCOMMUTE  
## weights: wts    
## 
## Moran I statistic standard deviate = 7.7339, p-value = 5.215e-15
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic       Expectation          Variance 
##       0.322859172      -0.005076142       0.001797951
##Highly significant spatial autocorrelation. Lets test using Monte Carlo approach without the assumption of normal distribution.

set.seed(4102)
mP <- moran.mc(Duval.sp$logCOMMUTE, 
               listw = wts,
               nsim = 99)
mP
## 
##  Monte-Carlo simulation of Moran I
## 
## data:  Duval.sp$logCOMMUTE 
## weights: wts  
## number of simulations + 1: 100 
## 
## statistic = 0.32286, observed rank = 100, p-value = 0.01
## alternative hypothesis: greater
#The results confirm that there is significant spatial autocorrelation


#CHECK Residuals from the Model

res <- residuals(modelD)

sm::sm.density(res, 
               model = "Normal")

#The model residuals does not have normal distribution

Duval.sf$res <- res

tm_shape(Duval.sf) +
  tm_fill("res") +
  tm_borders(col = "gray70") +
  tm_layout(title = "Linear Model Residuals")
## Variable "res" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

#the model over predicts by a higher margin (-3, -4) giving the residual longer left tail

There are clustered regions where the model over predicts logCommute based on explanatory variables (red and orange regions) and clustering is not evident where it under predicts logCommute (green).

Fit a Spatial Regression Model

#Test spatial regression fit

lm.LMtests(modelD, 
           listw = wts, 
           test = c("LMerr", "LMlag"))
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO
## + PCT_HISP + SUM_VEHICL + POPDEN, data = Duval.sf)
## weights: wts
## 
## LMerr = 25.584, df = 1, p-value = 4.235e-07
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO
## + PCT_HISP + SUM_VEHICL + POPDEN, data = Duval.sf)
## weights: wts
## 
## LMlag = 21.15, df = 1, p-value = 4.247e-06
#Both are significant, we try more robust test

lm.LMtests(modelD,
           listw = wts,
           test = c("RLMerr", "RLMlag"))
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO
## + PCT_HISP + SUM_VEHICL + POPDEN, data = Duval.sf)
## weights: wts
## 
## RLMerr = 4.5242, df = 1, p-value = 0.03342
## 
## 
##  Lagrange multiplier diagnostics for spatial dependence
## 
## data:  
## model: lm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO
## + PCT_HISP + SUM_VEHICL + POPDEN, data = Duval.sf)
## weights: wts
## 
## RLMlag = 0.090038, df = 1, p-value = 0.7641
##Robust test shows the error model as more fit (p-value is significant)

Fit with an spatial error model

model.errorD <- errorsarlm(logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO + PCT_HISP + SUM_VEHICL + POPDEN, 
                           data = Duval.sf, listw = wts, tol.solve=1.0e-20)

summary(model.error)
## 
## Call:
## errorsarlm(formula = logCOMMUTE ~ PCT_POV + SUM_ED_LES + MEAN_MEDOO + 
##     PCT_HISP + SUM_VEHICL + POPDEN, data = CEN.sf, listw = wts, 
##     tol.solve = 1e-20)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -4.821058 -0.230161  0.087165  0.348925  2.108316 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##               Estimate Std. Error z value  Pr(>|z|)
## (Intercept) 3.9711e+00 8.5045e-02 46.6943 < 2.2e-16
## PCT_POV     9.6354e-03 2.6952e-03  3.5750 0.0003502
## SUM_ED_LES  4.3118e-04 3.2407e-05 13.3051 < 2.2e-16
## MEAN_MEDOO  7.5151e-07 1.8133e-07  4.1445 3.406e-05
## PCT_HISP    3.3347e-03 1.6663e-03  2.0012 0.0453656
## SUM_VEHICL  1.1295e-03 5.2315e-05 21.5907 < 2.2e-16
## POPDEN      5.1526e+00 2.3558e+00  2.1872 0.0287288
## 
## Lambda: 0.81072, LR test value: 2267.4, p-value: < 2.22e-16
## Asymptotic standard error: 0.011185
##     z-value: 72.484, p-value: < 2.22e-16
## Wald statistic: 5253.9, p-value: < 2.22e-16
## 
## Log likelihood: -4091.724 for error model
## ML residual variance (sigma squared): 0.3463, (sigma: 0.58847)
## Number of observations: 4244 
## Number of parameters estimated: 9 
## AIC: 8201.4, (AIC for lm: 10467)
x <- 2 * (logLik(model.errorD) - logLik(modelD))/198
x[1]
## [1] 0.1552354
#The difference is considered large 

res <- residuals(model.errorD)

sm::sm.density(res, 
               model = "Normal")

1 | huge, .1 | large, .01 | good, .001 | okay

Predictions

The predict() method implements the predict.errorsarlm() function to calculate predictions from the spatial regression model. The prediction is decomposed into a “trend” term (explanatory variable effect) and a “signal” term (spatial smoother). The fit is the sum of the trend and the signal terms.

We make predictions at the values of the explanatory variables using the spatial error model

pre <- predict(model.errorD)
str(pre)
##  'sarlm.pred' Named num [1:198] 5.63 5.49 5.63 6.04 5.74 ...
##  - attr(*, "names")= chr [1:198] "1" "2" "3" "4" ...
##  - attr(*, "trend")= num [1:198] 5.7 5.54 5.63 6.13 5.39 ...
##  - attr(*, "signal")= num [1:198] -0.06905 -0.0505 0.00336 -0.09018 0.35277 ...
##  - attr(*, "region.id")= chr [1:198] "1" "2" "3" "4" ...
##  - attr(*, "pred.type")= chr "TS"
##  - attr(*, "call")= language predict.sarlm(object = model.errorD)
Duval.sp$logCOMMUTE[1:5]
## [1] 5.843544 5.488938 5.521461 6.272877 5.529429

The predictions are added to the spatial polygons data frame.

Duval.sf$fit <- as.numeric(pre)
Duval.sf$trend <- attr(pre, "trend")
Duval.sf$signal <- attr(pre, "signal")

The predictions are mapped.

( g3 <- ggplot() +
    geom_sf(data = Duval.sf, aes(fill = fit)) +
    scale_fill_gradient(low = "white", high = "green") +
    ggtitle("Predicted Commute(log)", 
          subtitle = "Number of Workers that Commutes(log)") )

( g4 <- ggplot() +
    geom_sf(data = Duval.sf, aes(fill = trend)) +
    scale_fill_gradient(low = "white", high = "orange") +
    ggtitle("Trend (Covariate)") )

( g5 <- ggplot() +
    geom_sf(data = Duval.sf, aes(fill = signal)) +
    scale_fill_gradient(low = "white", high = "blue") +
    ggtitle("Signal") )

grid.arrange(g3, g4, g5, nrow = 1)

*Commuting is spatially biased towards Urban areas, meaning there are more people riding the public transport, biking, and walking in urban areas. This maybe because of infrastructure availability or relative distance of home to work. Also, there is a lot more commuting going on in the South of Florida which can be explained by land use, type of work, and other cultural factors.

*The linear model also relatively under predicted commuting in the south of Florida more.

*NOTE: I encountered a Modifiable Areal Unit Problem (MAUP), specifically the problem of small area estimates when I first started this project becuase the data is disaggregated at the Census Block level. This means getting a lot of zeroes in the commuting or a lot of variability. I have to aggregate at the Census tract level to have a better estimate that is useful for some analysis.